library(tidyverse)
library(janitor)
library(kableExtra)
# Read the NHANES data and run the column names through janitor's
# clean_names() so later code can refer to them consistently.
nhanes <- read_csv("data/nhanes.csv") %>%
  clean_names()
# select ----
# Keep only the age column.
nhanes %>%
  select(age)
select variables
Select multiple variables by listing them:
# Keep the height and weight columns, in that order.
nhanes %>%
  select(height, weight)
select variables
You can use one_of() to do the same thing:
# one_of() takes the column names as a character vector.
# NOTE: tidyselect now recommends any_of() / all_of() in place of one_of().
nhanes %>%
  select(one_of(c("height", "weight")))
select variables
contains() selects variables with certain text in the variable name:
# Keep every column whose name contains the text "age".
nhanes %>%
  select(contains("age"))
select variables
You can select a range of columns using the var1:var2 pattern:
# Keep the contiguous run of columns from weight through bmi.
nhanes %>%
  select(weight:bmi)
select variables
Drop variables using the -var format:
# Keep everything except the id column.
nhanes %>%
  select(-id)
select variables
Drop a set of variables using the -(var1:var2) format:
# Keep everything except the run of columns from id through education.
nhanes %>%
  select(-(id:education))
# mutate ----
# Add a column holding the same value on every row, then show just it.
nhanes %>%
  mutate(country = "United States") %>%
  select(country)
# Derive height_inches by dividing height by 2.54 (presumably height is
# recorded in centimetres), then show every column containing "height".
nhanes %>%
  mutate(height_inches = height / 2.54) %>%
  select(contains("height"))

# Overwrite bmi with its value rounded to one decimal place.
nhanes %>%
  mutate(bmi = round(bmi, digits = 1)) %>%
  select(bmi)
# filter ----
# filter: keep only rows where gender is exactly "female".
nhanes %>%
  filter(gender == "female") %>%
  select(gender)
# filter: keep rows whose health_gen is anything other than "Good".
# Rows where health_gen is NA are dropped too, because the comparison
# evaluates to NA and filter() keeps only TRUE rows.
nhanes %>%
  filter(health_gen != "Good") %>%
  select(health_gen)
# filter: keep rows whose health_gen is "Good", "VGood", or "Excellent".
# The comparisons joined by | must use ==. Joining != tests with | is TRUE
# for every non-missing value (any value differs from at least two of the
# three), so that form would not filter anything out.
nhanes %>%
  filter(
    health_gen == "Good" |
      health_gen == "VGood" |
      health_gen == "Excellent"
  ) %>%
  select(health_gen)
# filter: %in% keeps rows whose health_gen matches any of the listed values.
nhanes %>%
  filter(health_gen %in% c("Good", "VGood", "Excellent")) %>%
  select(health_gen)
filter
You can chain together multiple filter functions. Doing it this way, we don’t have to create complex logic in one line.
# Apply two filters one after the other: first on gender, then on
# health_gen. Only rows passing both remain.
nhanes %>%
  filter(gender == "male") %>%
  filter(health_gen %in% c("Good", "VGood", "Excellent")) %>%
  select(gender, health_gen)
# filter: keep rows where age is greater than 50.
nhanes %>%
  filter(age > 50)
filter
You can drop NAs with !is.na().
# Among rows with age over 50, keep only those with a non-missing
# marital_status.
nhanes %>%
  filter(age > 50) %>%
  filter(!is.na(marital_status)) %>%
  select(age, marital_status)
filter
You can also drop NAs with drop_na().
# Same result as the !is.na() version, using tidyr's drop_na() on the
# marital_status column instead.
nhanes %>%
  filter(age > 50) %>%
  drop_na(marital_status) %>%
  select(age, marital_status)
summarize
This doesn’t work! Notice what the result is: mean() returns NA when the column contains missing values.
# Deliberately omits na.rm to show what mean() returns when the column
# contains missing values.
nhanes %>%
  summarize(mean_active_days = mean(phys_active_days))
summarize
Add na.rm = TRUE to make this work.
# na.rm = TRUE makes mean() skip missing values.
nhanes %>%
  summarize(mean_active_days = mean(phys_active_days, na.rm = TRUE))
summarize
You can have multiple arguments in each usage of summarize.
# Compute several summaries in one call: the mean and median of
# phys_active_days (ignoring NAs) and the total row count.
nhanes %>%
  summarize(
    mean_active_days = mean(phys_active_days, na.rm = TRUE),
    median_active_days = median(phys_active_days, na.rm = TRUE),
    number_of_responses = n()
  )
# group_by ----
# Mean of phys_active_days within each age_decade, ignoring missing values.
nhanes %>%
  group_by(age_decade) %>%
  summarize(mean_active_days = mean(phys_active_days, na.rm = TRUE))
group_by example
group_by works for multiple groups.
# One summary row per combination of age_decade and gender.
nhanes %>%
  group_by(age_decade, gender) %>%
  summarize(mean_active_days = mean(phys_active_days, na.rm = TRUE))
count
If you just want to count the number of things per group, you can use count.
# Number of rows in each age_decade.
nhanes %>%
  count(age_decade)
count
You can also count by multiple groups.
# Number of rows in each age_decade / gender combination.
nhanes %>%
  count(age_decade, gender)
arrange
arrange example
R arranges in ascending order by default.
# Summarise by age_decade and gender, then sort the result from the
# smallest mean_active_days to the largest.
nhanes %>%
  group_by(age_decade, gender) %>%
  summarize(mean_active_days = mean(phys_active_days, na.rm = TRUE)) %>%
  arrange(mean_active_days)
arrange example
You can also arrange in descending order.
# Same summary, sorted from the largest mean_active_days to the smallest
# by wrapping the column in desc().
nhanes %>%
  group_by(age_decade, gender) %>%
  summarize(mean_active_days = mean(phys_active_days, na.rm = TRUE)) %>%
  arrange(desc(mean_active_days))
Sometimes you want your results in a crosstab. We’ll use the tabyl function in the janitor package to make crosstabs automatically.
# Two-way crosstab of counts: gender in rows, age_decade in columns.
nhanes %>%
  tabyl(gender, age_decade)
adorn_ functions
janitor has a set of functions that all start with adorn_ that add a number of things to our crosstabs. You call them after tabyl.
# The next five pipelines build up the same crosstab one adorn_ step at a
# time; each one repeats the previous steps and adds one more.

# Step 1: add a totals row and a totals column.
nhanes %>%
  tabyl(gender, age_decade) %>%
  adorn_totals(c("row", "col"))

# Step 2: convert the counts to proportions.
nhanes %>%
  tabyl(gender, age_decade) %>%
  adorn_totals(c("row", "col")) %>%
  adorn_percentages()

# Step 3: format the proportions as percentages.
nhanes %>%
  tabyl(gender, age_decade) %>%
  adorn_totals(c("row", "col")) %>%
  adorn_percentages() %>%
  adorn_pct_formatting()

# Step 4: append the underlying counts next to each percentage.
nhanes %>%
  tabyl(gender, age_decade) %>%
  adorn_totals(c("row", "col")) %>%
  adorn_percentages() %>%
  adorn_pct_formatting() %>%
  adorn_ns()

# Step 5: add a title naming the column variable.
nhanes %>%
  tabyl(gender, age_decade) %>%
  adorn_totals(c("row", "col")) %>%
  adorn_percentages() %>%
  adorn_pct_formatting() %>%
  adorn_ns() %>%
  adorn_title()
You can also do three (or more) way crosstabs automatically by adding more variables to the tabyl function.
# Three-way crosstab: a third variable makes tabyl() return one
# gender x age_decade table per education level, and each adorn_ step
# is applied to every table.
nhanes %>%
  tabyl(gender, age_decade, education) %>%
  adorn_totals(c("row", "col")) %>%
  adorn_percentages() %>%
  adorn_pct_formatting() %>%
  adorn_ns() %>%
  adorn_title()
## $`8th Grade`
## age_decade
## gender 0-9 10-19 20-29 30-39 40-49 50-59
## female 0.0% (0) 0.0% (0) 9.1% (19) 18.7% (39) 16.3% (34) 15.3% (32)
## male 0.0% (0) 0.0% (0) 7.4% (18) 14.0% (34) 22.3% (54) 14.0% (34)
## Total 0.0% (0) 0.0% (0) 8.2% (37) 16.2% (73) 19.5% (88) 14.6% (66)
##
## 60-69 70+ NA_ Total
## 12.4% (26) 17.7% (37) 10.5% (22) 100.0% (209)
## 16.9% (41) 10.3% (25) 14.9% (36) 100.0% (242)
## 14.9% (67) 13.7% (62) 12.9% (58) 100.0% (451)
##
## $`9 - 11th Grade`
## age_decade
## gender 0-9 10-19 20-29 30-39 40-49
## female 0.0% (0) 0.0% (0) 17.9% (72) 15.2% (61) 14.9% (60)
## male 0.0% (0) 0.0% (0) 20.6% (100) 16.7% (81) 22.2% (108)
## Total 0.0% (0) 0.0% (0) 19.4% (172) 16.0% (142) 18.9% (168)
##
## 50-59 60-69 70+ NA_ Total
## 18.7% (75) 12.4% (50) 12.9% (52) 8.0% (32) 100.0% (402)
## 18.3% (89) 9.7% (47) 9.1% (44) 3.5% (17) 100.0% (486)
## 18.5% (164) 10.9% (97) 10.8% (96) 5.5% (49) 100.0% (888)
##
## $`College Grad`
## age_decade
## gender 0-9 10-19 20-29 30-39 40-49
## female 0.0% (0) 0.0% (0) 14.8% (163) 21.2% (233) 25.7% (282)
## male 0.0% (0) 0.0% (0) 13.0% (130) 21.4% (214) 19.9% (199)
## Total 0.0% (0) 0.0% (0) 14.0% (293) 21.3% (447) 22.9% (481)
##
## 50-59 60-69 70+ NA_ Total
## 19.7% (217) 10.9% (120) 5.2% (57) 2.5% (27) 100.0% (1099)
## 20.0% (200) 16.0% (160) 6.4% (64) 3.2% (32) 100.0% (999)
## 19.9% (417) 13.3% (280) 5.8% (121) 2.8% (59) 100.0% (2098)
##
## $`High School`
## age_decade
## gender 0-9 10-19 20-29 30-39 40-49
## female 0.0% (0) 0.0% (0) 20.3% (156) 13.6% (105) 17.5% (135)
## male 0.0% (0) 0.0% (0) 21.0% (157) 15.7% (117) 22.5% (168)
## Total 0.0% (0) 0.0% (0) 20.6% (313) 14.6% (222) 20.0% (303)
##
## 50-59 60-69 70+ NA_ Total
## 15.1% (116) 13.8% (106) 12.5% (96) 7.3% (56) 100.0% (770)
## 20.7% (155) 10.0% (75) 5.9% (44) 4.1% (31) 100.0% (747)
## 17.9% (271) 11.9% (181) 9.2% (140) 5.7% (87) 100.0% (1517)
##
## $`Some College`
## age_decade
## gender 0-9 10-19 20-29 30-39 40-49
## female 0.0% (0) 0.0% (0) 22.6% (271) 20.0% (239) 14.0% (167)
## male 0.0% (0) 0.0% (0) 24.9% (266) 19.9% (213) 17.6% (188)
## Total 0.0% (0) 0.0% (0) 23.7% (537) 19.9% (452) 15.7% (355)
##
## 50-59 60-69 70+ NA_ Total
## 15.3% (183) 14.8% (177) 8.7% (104) 4.7% (56) 100.0% (1197)
## 19.0% (203) 10.8% (116) 5.8% (62) 2.1% (22) 100.0% (1070)
## 17.0% (386) 12.9% (293) 7.3% (166) 3.4% (78) 100.0% (2267)
##
## $NA_
## age_decade
## gender 0-9 10-19 20-29 30-39 40-49 50-59
## female 48.6% (653) 50.9% (684) 0.0% (0) 0.0% (0) 0.2% (3) 0.0% (0)
## male 51.4% (738) 48.1% (690) 0.3% (4) 0.1% (2) 0.0% (0) 0.0% (0)
## Total 50.1% (1391) 49.4% (1374) 0.1% (4) 0.1% (2) 0.1% (3) 0.0% (0)
##
## 60-69 70+ NA_ Total
## 0.1% (1) 0.1% (2) 0.0% (0) 100.0% (1343)
## 0.0% (0) 0.0% (0) 0.1% (2) 100.0% (1436)
## 0.0% (1) 0.1% (2) 0.1% (2) 100.0% (2779)
Sometimes you want to save the results of your work to a new data frame.
# Store the per-decade summary in a new data frame, dropping any summary
# row that contains an NA, then print it.
phys_activity_by_age <- nhanes %>%
  group_by(age_decade) %>% #<<
  summarize(mean_active_days = mean(phys_active_days, na.rm = TRUE)) %>%
  drop_na()

phys_activity_by_age